Get crime data from ArcGIS API and remove (4) entries with missing geometry; write cleaned data to GeoJSON

# crime_dg <- st_read("https://utility.arcgis.com/usrsvcs/servers/0eaa6be357a74f5280157125e9b547fc/rest/services/OpenData_PublicSafety/FeatureServer/2/query?outFields=*&where=1%3D1&f=geojson")
# crime_dg$Date <- as.Date(fread(".//data//crime_dg.csv")$Date, "%Y/%m/%d")
# crime_dg <- crime_dg[!st_is_empty(crime_dg),]
# st_write(crime_dg, ".//data//crime_dg.geojson")

Read in data

crime_dg <- st_read(".//data//crime_dg.geojson")
## Reading layer `crime_dg' from data source 
##   `C:\Users\llint\OneDrive - Yale University\classes\625\CT Property\SDS625\data\crime_dg.geojson' 
##   using driver `GeoJSON'
## Simple feature collection with 709898 features and 12 fields
## Geometry type: POINT
## Dimension:     XY
## Bounding box:  xmin: -72.71865 ymin: 41.72215 xmax: -72.64829 ymax: 41.80744
## Geodetic CRS:  WGS 84
parcel_dg <- st_read(".//data//parcel_hartford.geojson")
## Reading layer `parcel_hartford' from data source 
##   `C:\Users\llint\OneDrive - Yale University\classes\625\CT Property\SDS625\data\parcel_hartford.geojson' 
##   using driver `GeoJSON'
## Simple feature collection with 19143 features and 47 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: -72.7182 ymin: 41.72368 xmax: -72.64208 ymax: 41.80743
## Geodetic CRS:  WGS 84

Census tracts from Tigris for map baselayer

hartford_census_data <- get_acs(geography = "tract", variable = c("B19013_001", "B01001_001"), state = "CT", county = "Hartford", year = 2021) |>
   dplyr::select(GEOID, variable, estimate) |>
   tidyr::pivot_wider(names_from = variable, values_from = estimate) |>
   dplyr::rename(population = "B01001_001", med_income = "B19013_001")

hartford_tracts <- st_filter(tracts(state = "CT"),
                             subset(county_subdivisions(state = "CT"), NAMELSAD == "Hartford town"),
                             .predicate = st_within)
hartford_tracts <- merge(hartford_tracts, hartford_census_data, by = "GEOID")

water <- st_intersection(area_water("CT", "Hartford"), hartford_tracts)
## Warning: attribute variables are assumed to be spatially constant throughout
## all geometries
crime_dg <- crime_dg |> subset(year(Date) %in% 2016:2021) |> st_transform(st_crs(hartford_tracts))
parcel_dg <- parcel_dg |> subset(State_Use_Description == "ONE FAMILY") |> st_transform(st_crs(hartford_tracts))

plot

ggplot(crime_dg) +
   geom_sf(data = hartford_tracts) +
   geom_sf(data = water, color = "blue", linewidth = 0.35) +
   geom_hex(aes(X, Y, fill = log10(..count..)), data = ~cbind(.x, st_coordinates(.x)), alpha = 0.75, bins = 50) +
   scale_fill_binned() +
   scale_x_continuous(guide = guide_axis(angle = 45)) +
   labs(x = "Latitude", y = "Longitude") +
   theme_bw()

g <- ggplot(dplyr::sample_n(crime_dg, 1e3)) +
   geom_sf(data = hartford_tracts, aes(label1 = GEOID,
                                       label2 = med_income,
                                       label3 = population)) +
   geom_sf(data = dplyr::sample_n(parcel_dg, 1e3)) +
   geom_density_2d(aes(X,Y), data = ~cbind(.x, st_coordinates(.x))) +
   stat_sf_coordinates(size = 0.1, color = "red") +
   labs(x = "Latitude", y = "Longitude") +
   theme_bw() +
   theme(axis.text.x = element_text(angle = 45))

p <- toWebGL(ggplotly(g))
p$x$data[[4]]$hoverinfo <- "none"
p

Data Exploration

The filters applied to the data are:

The manually curated variables are:

The highly correlated numeric covariates are

There appear to be more thefts and violent crimes near single family homes in dilapidated condition. Homes in worse condition tend to be older, but there are also older homes in good condition.

There is not a clear relationship between the year the parcel was built and its condition.

All covariates appear to correlate with the response variable, Assessed_Total.

parcel_dg %<>% subset(select = c("Assessed_Total", "Living_Area", "Effective_Area", "AYB",
                                 "Number_of_Bedroom", "Number_of_Baths", "Condition_Description"))

parcel_dg %<>% dplyr::rename(Price = "Assessed_Total", Year = "AYB", Bed = "Number_of_Bedroom",
                             Bath = "Number_of_Baths", Condition = "Condition_Description")

parcel_dg$Condition %<>% factor(levels = c("Dilapidated", "Very Poor", "Poor", "Fair", "Fair-Avg", "Average",
                                           "Avg-Good", "Good", "Good-VG", "Very Good", "Excellent"))

paste0("Dropping n=", nrow(parcel_dg) - nrow(na.omit(parcel_dg)), " rows with NAs.")
## [1] "Dropping n=19 rows with NAs."
X <- na.omit(parcel_dg)
stealing <- crime_dg[grep("ROBBERY|BURGLARY|LARCENY|THEFT", crime_dg$UCR_1_Category), ]
violence <- crime_dg[grep("ASSAULT|HOMICIDE|SHOOTING", crime_dg$UCR_1_Category), ]

sf_use_s2(FALSE)
X$Thefts <- st_is_within_distance(st_transform(X, crs = 26956),
                                  st_transform(stealing, crs = 26956),
                                  dist = units::set_units(150, "m")) |> sapply(length)

X$Violence <- st_is_within_distance(st_transform(X, crs = 26956),
                                    st_transform(violence, crs = 26956),
                                    dist = units::set_units(150, "m")) |> sapply(length)
X$GEOID <- hartford_tracts$GEOID[st_intersects(X, hartford_tracts) |> sapply(head, 1)]

setDT(copy(X))[as.data.table(hartford_tracts),
               .(violence_rate = sum(Violence)/i.population,
                 avg_property_value = mean(Price)),
               on = "GEOID", by = .EACHI]
parcel_proj <- X |> st_transform(crs = 26956) |> st_centroid() |> st_coordinates()
## Warning: st_centroid assumes attributes are constant over geometries
stealing_proj <- stealing |> st_transform(crs = 26956) |> st_coordinates()
X$Thefts_kde <- kde(stealing_proj, Hscv(stealing_proj)) |> predict(x = parcel_proj)
X_plot_vars <- st_drop_geometry(X)[c("Thefts", "Violence", "Living_Area", "Effective_Area",
                                     "Year", "Bed", "Bath", "Condition", "Price")]

# Plot pairwise correlation between numeric predictors
X_plot_vars |>
   GGally::ggpairs(lower = list(continuous = "points")) +
   theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Plot boxplots of numeric predictors with respect to condition description
# Pivot data longer so that we can facet by variable
X_plot_vars |>
   tidyr::pivot_longer(!Condition) |>
   ggplot(aes(x = name, y = value, fill = Condition)) +
   geom_boxplot(position = position_dodge(width = 1)) + 
   facet_wrap(~name, scales = "free") + 
   labs(x = NULL, y = NULL) +
   theme_bw()

## Analysis

Variable Selection. We leave out Thefts since it is highly correlated with Violence but has a lower correlation with Price than Violence does. For similar reasons, we leave out Effective_Area and keep Living_Area. We keep both Bed and Bath for now.

Transformations. From the correlation plots, we can see that Price, Living_Area, and Violence are right-skewed. To adjust for this, we take the log of Price and the square root of Living_Area and Violence.

# Fit linear models with no transformations
m0 <- lm(Price ~ Violence + Living_Area + Year + Bed + Bath + Condition, 
         data = X)
summary(m0)
## 
## Call:
## lm(formula = Price ~ Violence + Living_Area + Year + Bed + Bath + 
##     Condition, data = X)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -75062  -7023    856   6819 110854 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        -2.458e+05  1.440e+04 -17.063  < 2e-16 ***
## Violence           -2.521e+02  6.990e+00 -36.065  < 2e-16 ***
## Living_Area         3.349e+01  3.255e-01 102.893  < 2e-16 ***
## Year                1.132e+02  6.656e+00  17.002  < 2e-16 ***
## Bed                -2.306e+03  2.293e+02 -10.061  < 2e-16 ***
## Bath                5.426e+03  3.556e+02  15.258  < 2e-16 ***
## ConditionVery Poor  6.029e+02  8.696e+03   0.069   0.9447    
## ConditionPoor       1.345e+04  6.813e+03   1.974   0.0484 *  
## ConditionFair       2.685e+04  6.408e+03   4.190 2.82e-05 ***
## ConditionFair-Avg   3.280e+04  6.258e+03   5.241 1.65e-07 ***
## ConditionAverage    4.180e+04  6.163e+03   6.784 1.27e-11 ***
## ConditionAvg-Good   4.936e+04  6.158e+03   8.017 1.26e-15 ***
## ConditionGood       5.025e+04  6.157e+03   8.161 3.90e-16 ***
## ConditionGood-VG    5.442e+04  6.176e+03   8.810  < 2e-16 ***
## ConditionVery Good  5.704e+04  6.179e+03   9.231  < 2e-16 ***
## ConditionExcellent  6.455e+04  6.330e+03  10.197  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13740 on 7204 degrees of freedom
## Multiple R-squared:  0.8252, Adjusted R-squared:  0.8248 
## F-statistic:  2267 on 15 and 7204 DF,  p-value: < 2.2e-16
# Plot diagnostics
par(mfrow = c(2, 2))
plot(m0)

# Fit linear models with transformations
m1 <- lm(log(Price) ~ sqrt(Violence) + sqrt(Living_Area) + Year + 
           Bed + Bath + Condition, 
         data = X)
summary(m1)
## 
## Call:
## lm(formula = log(Price) ~ sqrt(Violence) + sqrt(Living_Area) + 
##     Year + Bed + Bath + Condition, data = X)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.27511 -0.07695  0.01528  0.09805  0.72980 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         6.884e+00  1.771e-01  38.860  < 2e-16 ***
## sqrt(Violence)     -4.028e-02  9.175e-04 -43.908  < 2e-16 ***
## sqrt(Living_Area)   2.823e-02  3.803e-04  74.234  < 2e-16 ***
## Year                9.114e-04  8.078e-05  11.282  < 2e-16 ***
## Bed                -9.644e-03  2.818e-03  -3.422 0.000626 ***
## Bath                4.061e-02  4.161e-03   9.759  < 2e-16 ***
## ConditionVery Poor  7.605e-01  1.045e-01   7.279 3.71e-13 ***
## ConditionPoor       9.360e-01  8.185e-02  11.436  < 2e-16 ***
## ConditionFair       1.068e+00  7.699e-02  13.875  < 2e-16 ***
## ConditionFair-Avg   1.131e+00  7.519e-02  15.043  < 2e-16 ***
## ConditionAverage    1.377e+00  7.404e-02  18.603  < 2e-16 ***
## ConditionAvg-Good   1.500e+00  7.398e-02  20.271  < 2e-16 ***
## ConditionGood       1.517e+00  7.398e-02  20.507  < 2e-16 ***
## ConditionGood-VG    1.552e+00  7.421e-02  20.911  < 2e-16 ***
## ConditionVery Good  1.580e+00  7.424e-02  21.277  < 2e-16 ***
## ConditionExcellent  1.650e+00  7.605e-02  21.702  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1651 on 7204 degrees of freedom
## Multiple R-squared:  0.7622, Adjusted R-squared:  0.7617 
## F-statistic:  1540 on 15 and 7204 DF,  p-value: < 2.2e-16
plot(m1)

There are many parcels beyond 4 standard errors below the regression line. This means the model is severely overestimating the value of these parcels. Let’s find out what these parcels are.

Solution. Refer to the previous correlation plots and observe that Price and Living Area appear to have a strongly correlated linear correlation. However, we performed a log transformation on the Price variable but a square-root transformation on the Living Area variable. This asymmetry may be the cause of the large residuals.

# Investigate the large negative residuals
r <- residuals(m1) / summary(m1)$sigma # standard residuals
o <- cbind(X, "StdResid" = r)[r < -4, ]
nrow(o)
## [1] 31
# Plot the offending parcels geographically
g <- ggplot(o) +
   geom_sf(data = hartford_tracts) +
   stat_sf_coordinates(size = 1, color = "red") +
   labs(x = "Latitude", y = "Longitude") +
   theme_bw() + 
   theme(axis.text.x = element_text(angle = 45, hjust = 1))
toWebGL(ggplotly(g))
# Plot residuals vs. Condition
# Function for number of observations 
give.n <- function(x){
  return(data.frame(
    y = quantile(x, .75) + 0.1, 
    label = paste0("n = ", length(x))
    ))
}
ggplot(data = o, aes(x = Condition, y = StdResid)) + 
  geom_boxplot() + 
  stat_summary(fun.data = give.n, geom = "text", fun.y = median) + 
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) + 
  theme_bw()

# Plot residuals vs transformed numeric covariates
o <- o |>
   st_drop_geometry() |>
  mutate(logPrice = log(Price), 
         sqrtViolence = sqrt(Violence),
         sqrtLiving_Area = sqrt(Living_Area))
GGally::ggpairs(o, columns = c("StdResid", "logPrice", "sqrtViolence", "sqrtLiving_Area", "Bed", "Bath", "Thefts"), lower = list(continuous = "points"),
        progress = F) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Let’s test our hypothesis.

# Take log of living area
m2 <- lm(log(Price) ~ sqrt(Violence) + log(Living_Area) + Year + 
           Bed + Bath + Condition, 
         data = X)
summary(m2)
## 
## Call:
## lm(formula = log(Price) ~ sqrt(Violence) + log(Living_Area) + 
##     Year + Bed + Bath + Condition, data = X)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.28251 -0.08783  0.01367  0.10248  1.09006 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         4.174e+00  2.016e-01  20.709  < 2e-16 ***
## sqrt(Violence)     -4.364e-02  9.532e-04 -45.777  < 2e-16 ***
## log(Living_Area)    5.548e-01  8.271e-03  67.070  < 2e-16 ***
## Year                7.687e-04  8.401e-05   9.150  < 2e-16 ***
## Bed                -8.052e-04  2.920e-03  -0.276    0.783    
## Bath                7.505e-02  4.174e-03  17.982  < 2e-16 ***
## ConditionVery Poor  7.686e-01  1.089e-01   7.059 1.84e-12 ***
## ConditionPoor       9.315e-01  8.531e-02  10.919  < 2e-16 ***
## ConditionFair       1.068e+00  8.025e-02  13.305  < 2e-16 ***
## ConditionFair-Avg   1.125e+00  7.838e-02  14.354  < 2e-16 ***
## ConditionAverage    1.365e+00  7.718e-02  17.691  < 2e-16 ***
## ConditionAvg-Good   1.485e+00  7.711e-02  19.252  < 2e-16 ***
## ConditionGood       1.500e+00  7.711e-02  19.450  < 2e-16 ***
## ConditionGood-VG    1.536e+00  7.735e-02  19.863  < 2e-16 ***
## ConditionVery Good  1.567e+00  7.738e-02  20.250  < 2e-16 ***
## ConditionExcellent  1.642e+00  7.927e-02  20.718  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1721 on 7204 degrees of freedom
## Multiple R-squared:  0.7417, Adjusted R-squared:  0.7411 
## F-statistic:  1379 on 15 and 7204 DF,  p-value: < 2.2e-16
par(mfrow = c(2, 2))
plot(m2)

# Drop number of bedrooms
m3 <- lm(log(Price) ~ sqrt(Violence) + log(Living_Area) + Year + 
           Bath + Condition, 
         data = X)
summary(m3)
## 
## Call:
## lm(formula = log(Price) ~ sqrt(Violence) + log(Living_Area) + 
##     Year + Bath + Condition, data = X)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.28246 -0.08786  0.01381  0.10273  1.08541 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         4.180e+00  2.005e-01  20.844  < 2e-16 ***
## sqrt(Violence)     -4.366e-02  9.488e-04 -46.017  < 2e-16 ***
## log(Living_Area)    5.535e-01  6.998e-03  79.099  < 2e-16 ***
## Year                7.694e-04  8.396e-05   9.164  < 2e-16 ***
## Bath                7.485e-02  4.105e-03  18.231  < 2e-16 ***
## ConditionVery Poor  7.685e-01  1.089e-01   7.058 1.84e-12 ***
## ConditionPoor       9.311e-01  8.530e-02  10.916  < 2e-16 ***
## ConditionFair       1.067e+00  8.023e-02  13.303  < 2e-16 ***
## ConditionFair-Avg   1.125e+00  7.836e-02  14.352  < 2e-16 ***
## ConditionAverage    1.365e+00  7.716e-02  17.690  < 2e-16 ***
## ConditionAvg-Good   1.484e+00  7.710e-02  19.252  < 2e-16 ***
## ConditionGood       1.499e+00  7.710e-02  19.449  < 2e-16 ***
## ConditionGood-VG    1.536e+00  7.733e-02  19.863  < 2e-16 ***
## ConditionVery Good  1.567e+00  7.737e-02  20.249  < 2e-16 ***
## ConditionExcellent  1.642e+00  7.926e-02  20.718  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.172 on 7205 degrees of freedom
## Multiple R-squared:  0.7416, Adjusted R-squared:  0.7411 
## F-statistic:  1477 on 14 and 7205 DF,  p-value: < 2.2e-16
plot(m3)

# Compare residuals beyond 4 SE's
r1 <- residuals(m1) / summary(m1)$sigma
sum(r1 < -4 | r1 > 4)
## [1] 33
r3 <- residuals(m3) / summary(m3)$sigma
sum(r3 < -4 | r3 > 4)
## [1] 22

Repeating the same residuals analysis from before, we see that there are no longer any strongly correlated covariates with the residuals.

# Investigate the large negative residuals
r <- residuals(m3) / summary(m3)$sigma # standard residuals
o <- cbind(X, "StdResid" = r)[r < -4, ]
nrow(o)
## [1] 18
# Plot the offending parcels geographically
g <- ggplot(o) +
  geom_sf(data = hartford_tracts) +
  stat_sf_coordinates(size = 1, color = "red") +
  labs(x = "Latitude", y = "Longitude") +
  theme_bw() + 
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
toWebGL(ggplotly(g))
# Plot residuals vs. Condition
# Function for number of observations 
give.n <- function(x){
  return(data.frame(
    y = quantile(x, .75) + 0.1, 
    label = paste0("n = ", length(x))
    ))
}
ggplot(data = o, aes(x = Condition, y = StdResid)) + 
  geom_boxplot() + 
  stat_summary(fun.data = give.n, geom = "text", fun.y = median) + 
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) + 
  theme_bw()

# Plot residuals vs transformed numeric covariates
o <- o |>
   st_drop_geometry() |>
  mutate(logPrice = log(Price), 
         sqrtViolence = sqrt(Violence),
         logLiving_Area = log(Living_Area))
GGally::ggpairs(o, columns = c("StdResid", "logPrice", "sqrtViolence", "logLiving_Area", "Bath", "Thefts"), lower = list(continuous = "points"),
        progress = F) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))